{ "cells": [ { "cell_type": "code", "execution_count": 237, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal length in cmsepal width in cmpetal length in cmpetal width in cmclass
05.13.51.40.2Iris-setosa
14.93.01.40.2Iris-setosa
24.73.21.30.2Iris-setosa
34.63.11.50.2Iris-setosa
45.03.61.40.2Iris-setosa
\n", "
" ], "text/plain": [ " sepal length in cm sepal width in cm petal length in cm \\\n", "0 5.1 3.5 1.4 \n", "1 4.9 3.0 1.4 \n", "2 4.7 3.2 1.3 \n", "3 4.6 3.1 1.5 \n", "4 5.0 3.6 1.4 \n", "\n", " petal width in cm class \n", "0 0.2 Iris-setosa \n", "1 0.2 Iris-setosa \n", "2 0.2 Iris-setosa \n", "3 0.2 Iris-setosa \n", "4 0.2 Iris-setosa " ] }, "execution_count": 237, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Import library and data\n", "import pandas as pd\n", "import numpy as np\n", "import os\n", "import argparse\n", "import random\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "%matplotlib inline\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.model_selection import train_test_split\n", "from collections import Counter\n", "import math\n", "import operator\n", "from scipy.spatial.distance import cosine\n", "import csv\n", "\n", "os.getcwd()\n", "data=pd.read_csv('C:/Users/mxm5116/Desktop/Data Mining/iris.csv')\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 139, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 139, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Lets see the visualization of different class\n", "%matplotlib inline\n", "# Let see the sepal pattern\n", "ax = data[data['class'] == 'Iris-setosa'].plot.scatter(x='sepal length in cm', y='sepal width in cm', c='blue', label='Iris-setosa')\n", "ax = data[data['class'] == 'Iris-versicolor'].plot.scatter(x='sepal length in cm', y='sepal width in cm', c='orange', label='Iris-versicolor', ax=ax)\n", "ax = data[data['class'] == 'Iris-virginica'].plot.scatter(x='sepal length in cm', y='sepal width in cm', c='red', label='Iris-virginica', ax=ax)\n", "ax\n" ] }, { "cell_type": "code", "execution_count": 140, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 140, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Let see the petal pattern\n", "\n", "ax = data[data['class'] == 'Iris-setosa'].plot.scatter(x='petal length in cm', y='petal width in cm', c='blue', label='Iris-setosa')\n", "ax = data[data['class'] == 'Iris-versicolor'].plot.scatter(x='petal length in cm', y='petal width in cm', c='orange', label='Iris-versicolor', ax=ax)\n", "ax = data[data['class'] == 'Iris-virginica'].plot.scatter(x='petal length in cm', y='petal width in cm', c='red', label='Iris-virginica', ax=ax)\n", "ax" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# a.Divide the dataset as development and test. Because kNN does not require training you don’t have a train dataset. Make sure randomly divide the dataset \n" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[6.3 2.7 4.9 1.8]\n", " [4.8 3.4 1.9 0.2]\n", " [5. 3. 1.6 0.2]\n", " [5.1 3.3 1.7 0.5]\n", " [5.6 2.7 4.2 1.3]\n", " [5.1 3.4 1.5 0.2]\n", " [5.7 3. 4.2 1.2]\n", " [7.7 3.8 6.7 2.2]\n", " [4.6 3.2 1.4 0.2]\n", " [6.2 2.9 4.3 1.3]\n", " [5.7 2.5 5. 2. ]\n", " [5.5 4.2 1.4 0.2]\n", " [6. 3. 4.8 1.8]\n", " [5.8 2.7 5.1 1.9]\n", " [6. 2.2 4. 1. ]\n", " [5.4 3. 4.5 1.5]\n", " [6.2 3.4 5.4 2.3]\n", " [5.5 2.3 4. 1.3]\n", " [5.4 3.9 1.7 0.4]\n", " [5. 2.3 3.3 1. ]\n", " [6.4 2.7 5.3 1.9]\n", " [5. 3.3 1.4 0.2]\n", " [5. 3.2 1.2 0.2]\n", " [5.5 2.4 3.8 1.1]\n", " [6.7 3. 5. 1.7]\n", " [4.9 3.1 1.5 0.1]\n", " [5.8 2.8 5.1 2.4]\n", " [5. 3.4 1.5 0.2]\n", " [5. 3.5 1.6 0.6]\n", " [5.9 3.2 4.8 1.8]\n", " [5.1 2.5 3. 1.1]\n", " [6.9 3.2 5.7 2.3]\n", " [6. 2.7 5.1 1.6]\n", " [6.1 2.6 5.6 1.4]\n", " [7.7 3. 6.1 2.3]\n", " [5.5 2.5 4. 1.3]\n", " [4.4 2.9 1.4 0.2]\n", " [4.3 3. 1.1 0.1]\n", " [6. 2.2 5. 1.5]\n", " [7.2 3.2 6. 1.8]\n", " [4.6 3.1 1.5 0.2]\n", " [5.1 3.5 1.4 0.3]\n", " [4.4 3. 
1.3 0.2]\n", " [6.3 2.5 4.9 1.5]\n", " [6.3 3.4 5.6 2.4]\n", " [4.6 3.4 1.4 0.3]\n", " [6.8 3. 5.5 2.1]\n", " [6.3 3.3 6. 2.5]\n", " [4.7 3.2 1.3 0.2]\n", " [6.1 2.9 4.7 1.4]\n", " [6.5 2.8 4.6 1.5]\n", " [6.2 2.8 4.8 1.8]\n", " [7. 3.2 4.7 1.4]\n", " [6.4 3.2 5.3 2.3]\n", " [5.1 3.8 1.6 0.2]\n", " [6.9 3.1 5.4 2.1]\n", " [5.9 3. 4.2 1.5]\n", " [6.5 3. 5.2 2. ]\n", " [5.7 2.6 3.5 1. ]\n", " [5.2 2.7 3.9 1.4]\n", " [6.1 3. 4.6 1.4]\n", " [4.5 2.3 1.3 0.3]\n", " [6.6 2.9 4.6 1.3]\n", " [5.5 2.6 4.4 1.2]\n", " [5.3 3.7 1.5 0.2]\n", " [5.6 3. 4.1 1.3]\n", " [7.3 2.9 6.3 1.8]\n", " [6.7 3.3 5.7 2.1]\n", " [5.1 3.7 1.5 0.4]\n", " [4.9 2.4 3.3 1. ]\n", " [6.7 3.3 5.7 2.5]\n", " [7.2 3. 5.8 1.6]\n", " [4.9 3.1 1.5 0.1]\n", " [6.7 3.1 5.6 2.4]\n", " [4.9 3. 1.4 0.2]\n", " [6.9 3.1 4.9 1.5]\n", " [7.4 2.8 6.1 1.9]\n", " [6.3 2.9 5.6 1.8]\n", " [5.7 2.8 4.1 1.3]\n", " [6.5 3. 5.5 1.8]\n", " [6.3 2.3 4.4 1.3]\n", " [6.4 2.9 4.3 1.3]\n", " [5.6 2.8 4.9 2. ]\n", " [5.9 3. 5.1 1.8]\n", " [5.4 3.4 1.7 0.2]\n", " [6.1 2.8 4. 1.3]\n", " [4.9 2.5 4.5 1.7]\n", " [5.8 4. 1.2 0.2]\n", " [5.8 2.6 4. 1.2]\n", " [7.1 3. 5.9 2.1]]\n", "[[6.1 2.8 4.7 1.2]\n", " [5.7 3.8 1.7 0.3]\n", " [7.7 2.6 6.9 2.3]\n", " [6. 2.9 4.5 1.5]\n", " [6.8 2.8 4.8 1.4]\n", " [5.4 3.4 1.5 0.4]\n", " [5.6 2.9 3.6 1.3]\n", " [6.9 3.1 5.1 2.3]\n", " [6.2 2.2 4.5 1.5]\n", " [5.8 2.7 3.9 1.2]\n", " [6.5 3.2 5.1 2. ]\n", " [4.8 3. 1.4 0.1]\n", " [5.5 3.5 1.3 0.2]\n", " [4.9 3.1 1.5 0.1]\n", " [5.1 3.8 1.5 0.3]\n", " [6.3 3.3 4.7 1.6]\n", " [6.5 3. 5.8 2.2]\n", " [5.6 2.5 3.9 1.1]\n", " [5.7 2.8 4.5 1.3]\n", " [6.4 2.8 5.6 2.2]\n", " [4.7 3.2 1.6 0.2]\n", " [6.1 3. 4.9 1.8]\n", " [5. 3.4 1.6 0.4]\n", " [6.4 2.8 5.6 2.1]\n", " [7.9 3.8 6.4 2. ]\n", " [6.7 3. 5.2 2.3]\n", " [6.7 2.5 5.8 1.8]\n", " [6.8 3.2 5.9 2.3]\n", " [4.8 3. 1.4 0.3]\n", " [4.8 3.1 1.6 0.2]\n", " [4.6 3.6 1. 0.2]\n", " [5.7 4.4 1.5 0.4]\n", " [6.7 3.1 4.4 1.4]\n", " [4.8 3.4 1.6 0.2]\n", " [4.4 3.2 1.3 0.2]\n", " [6.3 2.5 5. 
1.9]\n", " [6.4 3.2 4.5 1.5]\n", " [5.2 3.5 1.5 0.2]\n", " [5. 3.6 1.4 0.2]\n", " [5.2 4.1 1.5 0.1]\n", " [5.8 2.7 5.1 1.9]\n", " [6. 3.4 4.5 1.6]\n", " [6.7 3.1 4.7 1.5]\n", " [5.4 3.9 1.3 0.4]\n", " [5.4 3.7 1.5 0.2]\n", " [5.5 2.4 3.7 1. ]\n", " [6.3 2.8 5.1 1.5]\n", " [6.4 3.1 5.5 1.8]\n", " [6.6 3. 4.4 1.4]\n", " [7.2 3.6 6.1 2.5]\n", " [5.7 2.9 4.2 1.3]\n", " [7.6 3. 6.6 2.1]\n", " [5.6 3. 4.5 1.5]\n", " [5.1 3.5 1.4 0.2]\n", " [7.7 2.8 6.7 2. ]\n", " [5.8 2.7 4.1 1. ]\n", " [5.2 3.4 1.4 0.2]\n", " [5. 3.5 1.3 0.3]\n", " [5.1 3.8 1.9 0.4]\n", " [5. 2. 3.5 1. ]]\n", "['Iris-virginica' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'\n", " 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica' 'Iris-setosa'\n", " 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor' 'Iris-versicolor'\n", " 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor'\n", " 'Iris-virginica' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'\n", " 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-setosa'\n", " 'Iris-setosa' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'\n", " 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor'\n", " 'Iris-setosa' 'Iris-setosa' 'Iris-virginica' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'\n", " 'Iris-virginica' 'Iris-setosa' 'Iris-virginica' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'\n", " 'Iris-versicolor' 'Iris-virginica' 'Iris-setosa' 'Iris-virginica'\n", " 'Iris-versicolor' 'Iris-virginica' 'Iris-versicolor' 'Iris-versicolor'\n", " 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor' 'Iris-versicolor'\n", " 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-virginica' 'Iris-setosa' 'Iris-versicolor'\n", " 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor' 
'Iris-virginica'\n", " 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica' 'Iris-setosa'\n", " 'Iris-versicolor' 'Iris-virginica']\n", "['Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor'\n", " 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica'\n", " 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-setosa'\n", " 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'\n", " 'Iris-virginica' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-virginica' 'Iris-setosa' 'Iris-virginica'\n", " 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'\n", " 'Iris-setosa' 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor'\n", " 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-virginica'\n", " 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'\n", " 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor'\n", " 'Iris-virginica' 'Iris-versicolor' 'Iris-virginica' 'Iris-versicolor'\n", " 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa'\n", " 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor']\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\mxm5116\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: DeprecationWarning: \n", ".ix is deprecated. 
Please use\n", ".loc for label based indexing or\n", ".iloc for positional indexing\n", "\n", "See the documentation here:\n", "http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated\n", " \n" ] } ], "source": [ "# create design matrix X and target vector y\n", "# .iloc picks the first four (feature) columns by position; the old .ix indexer\n", "# is deprecated and no longer exists in current pandas releases\n", "X = np.array(data.iloc[:, 0:4]) \n", "y = np.array(data['class']) \n", "\n", "Dev_data_X, \\\n", "test_data_X, \\\n", "Dev_data_y, \\\n", "test_data_y = train_test_split(X, y, test_size=0.40, random_state=42)\n", "print(Dev_data_X)\n", "print(test_data_X)\n", "print(Dev_data_y)\n", "print(test_data_y)" ] }, { "cell_type": "code", "execution_count": 142, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(90, 4)\n", "(60, 4)\n", "(90,)\n", "(60,)\n" ] } ], "source": [ "print(Dev_data_X.shape)\n", "print(test_data_X.shape)\n", "print(Dev_data_y.shape)\n", "print(test_data_y.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# b.\timplement kNN using the following hyperparameters\n", "* number of neighbor K \n", " * 1,3,5,7\n", "* distance metric\n", "•\teuclidean distance\n", "•\tnormalized euclidean distance\n", "•\tcosine similarity\n", "\n", "\n", "# c. Using the development dataset,\n", "\n", " Calculate accuracy by iterating all of the development data point\n", " Find optimal hyperparameters\n", "\n", " * Draw bar charts for accuracy\n", " \n", " \n", "# d. Using the test dataset\n", "\n", " * Use the optimal hyperparameters you found in the step c, and use it to calculate the final accuracy. 
\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Lets Find the best Hyperparameter for Iris data out of K=1,3,5,7,9,11,13,15,17,19,21" ] }, { "cell_type": "code", "execution_count": 143, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\mxm5116\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.\n", " warnings.warn(CV_WARNING, FutureWarning)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Best Hyper Parameters:\n", " {'algorithm': 'auto', 'leaf_size': 1, 'n_jobs': -1, 'n_neighbors': 7, 'weights': 'distance'}\n", "Accuracy: 0.9833333333333333\n", "Confusion Metrix:\n", " [[23 0 0]\n", " [ 0 19 1]\n", " [ 0 0 17]]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\mxm5116\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py:813: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. 
This will change numeric results when test-set sizes are unequal.\n", " DeprecationWarning)\n" ] } ], "source": [ "# Now lets Run KNN with a range of K value\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn import metrics\n", "\n", "params = {'n_neighbors':[1,3,5,7,9,11,13,15,17,19,21],\n", " 'leaf_size':[1,2,3,5,10,15,20,30],\n", " 'weights':['uniform', 'distance'],\n", " 'algorithm':['auto', 'ball_tree','kd_tree','brute'],\n", " 'n_jobs':[-1]}\n", "Dev_model = KNeighborsClassifier(n_jobs=-1)\n", "KNN_model = GridSearchCV(Dev_model, param_grid=params, n_jobs=1)\n", "KNN_model.fit(Dev_data_X,Dev_data_y)\n", "print(\"Best Hyper Parameters:\\n\",KNN_model.best_params_)\n", "prediction=KNN_model.predict(test_data_X)\n", "# sklearn metrics take (y_true, y_pred): with this order the confusion-matrix rows are\n", "# the true classes and the columns are the predicted classes\n", "print(\"Accuracy:\",metrics.accuracy_score(test_data_y,prediction))\n", "print(\"Confusion Metrix:\\n\",metrics.confusion_matrix(test_data_y,prediction))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# So,from above program it was found that best hyperparameter,K =7, Now we will apply K=7 in our euclidean distance KNN" ] }, { "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [], "source": [ "# Now Apply euclidean distance in KNN step by step \n", "def train(Dev_data_X, Dev_data_y):\n", "    \"\"\"Do nothing and return None; kept so the kNN code has a train step to call.\"\"\"\n", "    return\n", "\n", "def predict(Dev_data_X, Dev_data_y, test_data_X, k):\n", "    \"\"\"Predict the label of one sample by majority vote of its k nearest neighbours.\n", "\n", "    Dev_data_X   -- 2-D NumPy array, one development sample per row\n", "    Dev_data_y   -- labels, Dev_data_y[i] belongs to row i of Dev_data_X\n", "    test_data_X  -- the single sample to classify (one feature vector)\n", "    k            -- number of neighbours that vote\n", "\n", "    Returns the most common label among the k rows of Dev_data_X with the\n", "    smallest Euclidean distance to test_data_X.\n", "    Raises ValueError when k is smaller than 1 or larger than len(Dev_data_X).\n", "    \"\"\"\n", "    n_dev = len(Dev_data_X)\n", "    if not 1 <= k <= n_dev:\n", "        raise ValueError('k must be between 1 and {} (got {})'.format(n_dev, k))\n", "\n", "    # Euclidean distance from the query sample to every development row at once\n", "    distances = np.sqrt(np.sum(np.square(Dev_data_X - test_data_X), axis=1))\n", "\n", "    # stable sort: rows at equal distance keep their development-set order\n", "    nearest = np.argsort(distances, kind='stable')[:k]\n", "\n", "    # make a list of the k neighbors' targets\n", "    targets = [Dev_data_y[index] for index in nearest]\n", "\n", "    # return most common target\n", "    return Counter(targets).most_common(1)[0][0]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "def 
k_nearest_neighbor(Dev_data_X, Dev_data_y, test_data_X, k):\n", "    \"\"\"Classify every row of test_data_X with the hand-written kNN.\n", "\n", "    Calls predict() once per row of test_data_X against the development set\n", "    (Dev_data_X, Dev_data_y) and returns the predicted labels, in row order,\n", "    as a NumPy array.\n", "    \"\"\"\n", "    # train on the develop data (train() does nothing; see its definition)\n", "    train(Dev_data_X, Dev_data_y)\n", "\n", "    # loop over all observations\n", "    predictions = [predict(Dev_data_X, Dev_data_y, test_row, k) for test_row in test_data_X]\n", "\n", "    return np.asarray(predictions)\n" ] }, { "cell_type": "code", "execution_count": 242, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " accuracy1 98.33333333333333\n", " accuracy3 98.33333333333333\n", " accuracy5 98.33333333333333\n", " accuracy7 98.33333333333333\n", " accuracy9 98.33333333333333\n", " accuracy11 100.0\n", " accuracy13 100.0\n" ] } ], "source": [ 
"# Make predictions and evaluate accuracy for every candidate k.\n", "# The k that is printed is the same loop variable that is passed to the classifier,\n", "# so the label and the value actually used cannot disagree.\n", "for k in (1, 3, 5, 7, 9, 11, 13):\n", "    predictions = k_nearest_neighbor(Dev_data_X, Dev_data_y, test_data_X, k)\n", "\n", "    # Evaluate accuracy\n", "    accuracy = accuracy_score(test_data_y, predictions)\n", "    print(\" accuracy{} {}\".format(k, 100*accuracy))" ] }, { "cell_type": "code", "execution_count": 207, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt; plt.rcdefaults()\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "objects=('k=1','k=3','k=5','k=7','k=9','k=11','k=13')\n", "y_pos = np.arange(len(objects))\n", "performance = [98.33,98.33,98.33,98.33,98.33,100.0,100.0]\n", "plt.bar(y_pos, performance, align='center', alpha=0.5,color='green')\n", "plt.xticks(y_pos, objects)\n", "plt.ylabel('Performance')\n", "plt.title('Performance with K value')\n", "plt.show()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# So at the starting, we have seen that K=7 is best hyperparameter. Now from the above manual coding, we have found that upto k=9, the accuracy is same for euclidean distance. So we will take k=7" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Now use normalized euclidean distance \n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0.64102564 0.43589744 0.16666667 0.01282051]\n", " [0.61538462 0.37179487 0.16666667 0.01282051]\n", " [0.58974359 0.3974359 0.15384615 0.01282051]\n", " [0.57692308 0.38461538 0.17948718 0.01282051]\n", " [0.62820513 0.44871795 0.16666667 0.01282051]\n", " [0.67948718 0.48717949 0.20512821 0.03846154]\n", " [0.57692308 0.42307692 0.16666667 0.02564103]\n", " [0.62820513 0.42307692 0.17948718 0.01282051]\n", " [0.55128205 0.35897436 0.16666667 0.01282051]\n", " [0.61538462 0.38461538 0.17948718 0. ]\n", " [0.67948718 0.46153846 0.17948718 0.01282051]\n", " [0.6025641 0.42307692 0.19230769 0.01282051]\n", " [0.6025641 0.37179487 0.16666667 0. ]\n", " [0.53846154 0.37179487 0.12820513 0. 
]\n", " [0.73076923 0.5 0.14102564 0.01282051]\n", " [0.71794872 0.55128205 0.17948718 0.03846154]\n", " [0.67948718 0.48717949 0.15384615 0.03846154]\n", " [0.64102564 0.43589744 0.16666667 0.02564103]\n", " [0.71794872 0.47435897 0.20512821 0.02564103]\n", " [0.64102564 0.47435897 0.17948718 0.02564103]\n", " [0.67948718 0.42307692 0.20512821 0.01282051]\n", " [0.64102564 0.46153846 0.17948718 0.03846154]\n", " [0.57692308 0.44871795 0.11538462 0.01282051]\n", " [0.64102564 0.41025641 0.20512821 0.05128205]\n", " [0.6025641 0.42307692 0.23076923 0.01282051]\n", " [0.62820513 0.37179487 0.19230769 0.01282051]\n", " [0.62820513 0.42307692 0.19230769 0.03846154]\n", " [0.65384615 0.43589744 0.17948718 0.01282051]\n", " [0.65384615 0.42307692 0.16666667 0.01282051]\n", " [0.58974359 0.3974359 0.19230769 0.01282051]\n", " [0.6025641 0.38461538 0.19230769 0.01282051]\n", " [0.67948718 0.42307692 0.17948718 0.03846154]\n", " [0.65384615 0.51282051 0.17948718 0. ]\n", " [0.69230769 0.52564103 0.16666667 0.01282051]\n", " [0.61538462 0.38461538 0.17948718 0. ]\n", " [0.62820513 0.3974359 0.14102564 0.01282051]\n", " [0.69230769 0.43589744 0.15384615 0.01282051]\n", " [0.61538462 0.38461538 0.17948718 0. 
]\n", " [0.55128205 0.37179487 0.15384615 0.01282051]\n", " [0.64102564 0.42307692 0.17948718 0.01282051]\n", " [0.62820513 0.43589744 0.15384615 0.02564103]\n", " [0.56410256 0.28205128 0.15384615 0.02564103]\n", " [0.55128205 0.3974359 0.15384615 0.01282051]\n", " [0.62820513 0.43589744 0.19230769 0.06410256]\n", " [0.64102564 0.47435897 0.23076923 0.03846154]\n", " [0.6025641 0.37179487 0.16666667 0.02564103]\n", " [0.64102564 0.47435897 0.19230769 0.01282051]\n", " [0.57692308 0.3974359 0.16666667 0.01282051]\n", " [0.66666667 0.46153846 0.17948718 0.01282051]\n", " [0.62820513 0.41025641 0.16666667 0.01282051]\n", " [0.88461538 0.3974359 0.58974359 0.16666667]\n", " [0.80769231 0.3974359 0.56410256 0.17948718]\n", " [0.87179487 0.38461538 0.61538462 0.17948718]\n", " [0.69230769 0.28205128 0.5 0.15384615]\n", " [0.82051282 0.34615385 0.57692308 0.17948718]\n", " [0.71794872 0.34615385 0.56410256 0.15384615]\n", " [0.79487179 0.41025641 0.58974359 0.19230769]\n", " [0.61538462 0.29487179 0.41025641 0.11538462]\n", " [0.83333333 0.35897436 0.57692308 0.15384615]\n", " [0.65384615 0.33333333 0.48717949 0.16666667]\n", " [0.62820513 0.24358974 0.43589744 0.11538462]\n", " [0.74358974 0.37179487 0.52564103 0.17948718]\n", " [0.75641026 0.26923077 0.5 0.11538462]\n", " [0.76923077 0.35897436 0.58974359 0.16666667]\n", " [0.70512821 0.35897436 0.44871795 0.15384615]\n", " [0.84615385 0.38461538 0.55128205 0.16666667]\n", " [0.70512821 0.37179487 0.56410256 0.17948718]\n", " [0.73076923 0.33333333 0.51282051 0.11538462]\n", " [0.78205128 0.26923077 0.56410256 0.17948718]\n", " [0.70512821 0.30769231 0.48717949 0.12820513]\n", " [0.74358974 0.3974359 0.6025641 0.21794872]\n", " [0.76923077 0.34615385 0.5 0.15384615]\n", " [0.79487179 0.30769231 0.61538462 0.17948718]\n", " [0.76923077 0.34615385 0.58974359 0.14102564]\n", " [0.80769231 0.35897436 0.53846154 0.15384615]\n", " [0.83333333 0.37179487 0.55128205 0.16666667]\n", " [0.85897436 0.34615385 0.6025641 
0.16666667]\n", " [0.84615385 0.37179487 0.62820513 0.20512821]\n", " [0.75641026 0.35897436 0.56410256 0.17948718]\n", " [0.71794872 0.32051282 0.43589744 0.11538462]\n", " [0.69230769 0.29487179 0.47435897 0.12820513]\n", " [0.69230769 0.29487179 0.46153846 0.11538462]\n", " [0.73076923 0.33333333 0.48717949 0.14102564]\n", " [0.75641026 0.33333333 0.64102564 0.19230769]\n", " [0.67948718 0.37179487 0.56410256 0.17948718]\n", " [0.75641026 0.42307692 0.56410256 0.19230769]\n", " [0.84615385 0.38461538 0.58974359 0.17948718]\n", " [0.79487179 0.28205128 0.55128205 0.15384615]\n", " [0.70512821 0.37179487 0.51282051 0.15384615]\n", " [0.69230769 0.30769231 0.5 0.15384615]\n", " [0.69230769 0.32051282 0.55128205 0.14102564]\n", " [0.76923077 0.37179487 0.57692308 0.16666667]\n", " [0.73076923 0.32051282 0.5 0.14102564]\n", " [0.62820513 0.28205128 0.41025641 0.11538462]\n", " [0.70512821 0.33333333 0.52564103 0.15384615]\n", " [0.71794872 0.37179487 0.52564103 0.14102564]\n", " [0.71794872 0.35897436 0.52564103 0.15384615]\n", " [0.78205128 0.35897436 0.53846154 0.15384615]\n", " [0.64102564 0.30769231 0.37179487 0.12820513]\n", " [0.71794872 0.34615385 0.51282051 0.15384615]\n", " [0.79487179 0.41025641 0.75641026 0.30769231]\n", " [0.73076923 0.33333333 0.64102564 0.23076923]\n", " [0.8974359 0.37179487 0.74358974 0.25641026]\n", " [0.79487179 0.35897436 0.70512821 0.21794872]\n", " [0.82051282 0.37179487 0.73076923 0.26923077]\n", " [0.96153846 0.37179487 0.83333333 0.25641026]\n", " [0.61538462 0.30769231 0.56410256 0.20512821]\n", " [0.92307692 0.35897436 0.79487179 0.21794872]\n", " [0.84615385 0.30769231 0.73076923 0.21794872]\n", " [0.91025641 0.44871795 0.76923077 0.30769231]\n", " [0.82051282 0.3974359 0.64102564 0.24358974]\n", " [0.80769231 0.33333333 0.66666667 0.23076923]\n", " [0.85897436 0.37179487 0.69230769 0.25641026]\n", " [0.71794872 0.30769231 0.62820513 0.24358974]\n", " [0.73076923 0.34615385 0.64102564 0.29487179]\n", " [0.80769231 0.3974359 
0.66666667 0.28205128]\n", " [0.82051282 0.37179487 0.69230769 0.21794872]\n", " [0.97435897 0.47435897 0.84615385 0.26923077]\n", " [0.97435897 0.32051282 0.87179487 0.28205128]\n", " [0.75641026 0.26923077 0.62820513 0.17948718]\n", " [0.87179487 0.3974359 0.71794872 0.28205128]\n", " [0.70512821 0.34615385 0.61538462 0.24358974]\n", " [0.97435897 0.34615385 0.84615385 0.24358974]\n", " [0.79487179 0.33333333 0.61538462 0.21794872]\n", " [0.84615385 0.41025641 0.71794872 0.25641026]\n", " [0.91025641 0.3974359 0.75641026 0.21794872]\n", " [0.78205128 0.34615385 0.6025641 0.21794872]\n", " [0.76923077 0.37179487 0.61538462 0.21794872]\n", " [0.80769231 0.34615385 0.70512821 0.25641026]\n", " [0.91025641 0.37179487 0.73076923 0.19230769]\n", " [0.93589744 0.34615385 0.76923077 0.23076923]\n", " [1. 0.47435897 0.80769231 0.24358974]\n", " [0.80769231 0.34615385 0.70512821 0.26923077]\n", " [0.79487179 0.34615385 0.64102564 0.17948718]\n", " [0.76923077 0.32051282 0.70512821 0.16666667]\n", " [0.97435897 0.37179487 0.76923077 0.28205128]\n", " [0.79487179 0.42307692 0.70512821 0.29487179]\n", " [0.80769231 0.38461538 0.69230769 0.21794872]\n", " [0.75641026 0.37179487 0.6025641 0.21794872]\n", " [0.87179487 0.38461538 0.67948718 0.25641026]\n", " [0.84615385 0.38461538 0.70512821 0.29487179]\n", " [0.87179487 0.38461538 0.64102564 0.28205128]\n", " [0.73076923 0.33333333 0.64102564 0.23076923]\n", " [0.85897436 0.3974359 0.74358974 0.28205128]\n", " [0.84615385 0.41025641 0.71794872 0.30769231]\n", " [0.84615385 0.37179487 0.65384615 0.28205128]\n", " [0.79487179 0.30769231 0.62820513 0.23076923]\n", " [0.82051282 0.37179487 0.65384615 0.24358974]\n", " [0.78205128 0.42307692 0.67948718 0.28205128]\n", " [0.74358974 0.37179487 0.64102564 0.21794872]]\n" ] } ], "source": [ "# normalizing the data\n", "# min-max scale each feature (column) separately with axis=0; one global min/max\n", "# would only divide every distance by the same constant and leave the nearest\n", "# neighbours exactly as in the un-normalized case\n", "normalize = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))\n", "print(normalize)\n", "\n", "Dev_norm_data_X, \\\n", "test_norm_data_X, \\\n", "Dev_norm_data_y, \\\n", 
"test_norm_data_y = train_test_split(normalize, y, test_size=0.40, random_state=42)\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0.79487179 0.33333333 0.61538462 0.21794872]\n", " [0.6025641 0.42307692 0.23076923 0.01282051]\n", " [0.62820513 0.37179487 0.19230769 0.01282051]\n", " [0.64102564 0.41025641 0.20512821 0.05128205]\n", " [0.70512821 0.33333333 0.52564103 0.15384615]\n", " [0.64102564 0.42307692 0.17948718 0.01282051]\n", " [0.71794872 0.37179487 0.52564103 0.14102564]\n", " [0.97435897 0.47435897 0.84615385 0.26923077]\n", " [0.57692308 0.3974359 0.16666667 0.01282051]\n", " [0.78205128 0.35897436 0.53846154 0.15384615]\n", " [0.71794872 0.30769231 0.62820513 0.24358974]\n", " [0.69230769 0.52564103 0.16666667 0.01282051]\n", " [0.75641026 0.37179487 0.6025641 0.21794872]\n", " [0.73076923 0.33333333 0.64102564 0.23076923]\n", " [0.75641026 0.26923077 0.5 0.11538462]\n", " [0.67948718 0.37179487 0.56410256 0.17948718]\n", " [0.78205128 0.42307692 0.67948718 0.28205128]\n", " [0.69230769 0.28205128 0.5 0.15384615]\n", " [0.67948718 0.48717949 0.20512821 0.03846154]\n", " [0.62820513 0.28205128 0.41025641 0.11538462]\n", " [0.80769231 0.33333333 0.66666667 0.23076923]\n", " [0.62820513 0.41025641 0.16666667 0.01282051]\n", " [0.62820513 0.3974359 0.14102564 0.01282051]\n", " [0.69230769 0.29487179 0.47435897 0.12820513]\n", " [0.84615385 0.37179487 0.62820513 0.20512821]\n", " [0.61538462 0.38461538 0.17948718 0. 
]\n", " [0.73076923 0.34615385 0.64102564 0.29487179]\n", " [0.62820513 0.42307692 0.17948718 0.01282051]\n", " [0.62820513 0.43589744 0.19230769 0.06410256]\n", " [0.74358974 0.3974359 0.6025641 0.21794872]\n", " [0.64102564 0.30769231 0.37179487 0.12820513]\n", " [0.87179487 0.3974359 0.71794872 0.28205128]\n", " [0.75641026 0.33333333 0.64102564 0.19230769]\n", " [0.76923077 0.32051282 0.70512821 0.16666667]\n", " [0.97435897 0.37179487 0.76923077 0.28205128]\n", " [0.69230769 0.30769231 0.5 0.15384615]\n", " [0.55128205 0.35897436 0.16666667 0.01282051]\n", " [0.53846154 0.37179487 0.12820513 0. ]\n", " [0.75641026 0.26923077 0.62820513 0.17948718]\n", " [0.91025641 0.3974359 0.75641026 0.21794872]\n", " [0.57692308 0.38461538 0.17948718 0.01282051]\n", " [0.64102564 0.43589744 0.16666667 0.02564103]\n", " [0.55128205 0.37179487 0.15384615 0.01282051]\n", " [0.79487179 0.30769231 0.61538462 0.17948718]\n", " [0.79487179 0.42307692 0.70512821 0.29487179]\n", " [0.57692308 0.42307692 0.16666667 0.02564103]\n", " [0.85897436 0.37179487 0.69230769 0.25641026]\n", " [0.79487179 0.41025641 0.75641026 0.30769231]\n", " [0.58974359 0.3974359 0.15384615 0.01282051]\n", " [0.76923077 0.35897436 0.58974359 0.16666667]\n", " [0.82051282 0.34615385 0.57692308 0.17948718]\n", " [0.78205128 0.34615385 0.6025641 0.21794872]\n", " [0.88461538 0.3974359 0.58974359 0.16666667]\n", " [0.80769231 0.3974359 0.66666667 0.28205128]\n", " [0.64102564 0.47435897 0.19230769 0.01282051]\n", " [0.87179487 0.38461538 0.67948718 0.25641026]\n", " [0.74358974 0.37179487 0.52564103 0.17948718]\n", " [0.82051282 0.37179487 0.65384615 0.24358974]\n", " [0.71794872 0.32051282 0.43589744 0.11538462]\n", " [0.65384615 0.33333333 0.48717949 0.16666667]\n", " [0.76923077 0.37179487 0.57692308 0.16666667]\n", " [0.56410256 0.28205128 0.15384615 0.02564103]\n", " [0.83333333 0.35897436 0.57692308 0.15384615]\n", " [0.69230769 0.32051282 0.55128205 0.14102564]\n", " [0.66666667 0.46153846 0.17948718 
0.01282051]\n", " [0.70512821 0.37179487 0.51282051 0.15384615]\n", " [0.92307692 0.35897436 0.79487179 0.21794872]\n", " [0.84615385 0.41025641 0.71794872 0.25641026]\n", " [0.64102564 0.46153846 0.17948718 0.03846154]\n", " [0.61538462 0.29487179 0.41025641 0.11538462]\n", " [0.84615385 0.41025641 0.71794872 0.30769231]\n", " [0.91025641 0.37179487 0.73076923 0.19230769]\n", " [0.61538462 0.38461538 0.17948718 0. ]\n", " [0.84615385 0.38461538 0.70512821 0.29487179]\n", " [0.61538462 0.37179487 0.16666667 0.01282051]\n", " [0.87179487 0.38461538 0.61538462 0.17948718]\n", " [0.93589744 0.34615385 0.76923077 0.23076923]\n", " [0.79487179 0.35897436 0.70512821 0.21794872]\n", " [0.71794872 0.34615385 0.51282051 0.15384615]\n", " [0.82051282 0.37179487 0.69230769 0.21794872]\n", " [0.79487179 0.28205128 0.55128205 0.15384615]\n", " [0.80769231 0.35897436 0.53846154 0.15384615]\n", " [0.70512821 0.34615385 0.61538462 0.24358974]\n", " [0.74358974 0.37179487 0.64102564 0.21794872]\n", " [0.67948718 0.42307692 0.20512821 0.01282051]\n", " [0.76923077 0.34615385 0.5 0.15384615]\n", " [0.61538462 0.30769231 0.56410256 0.20512821]\n", " [0.73076923 0.5 0.14102564 0.01282051]\n", " [0.73076923 0.32051282 0.5 0.14102564]\n", " [0.8974359 0.37179487 0.74358974 0.25641026]]\n", "[[0.76923077 0.34615385 0.58974359 0.14102564]\n", " [0.71794872 0.47435897 0.20512821 0.02564103]\n", " [0.97435897 0.32051282 0.87179487 0.28205128]\n", " [0.75641026 0.35897436 0.56410256 0.17948718]\n", " [0.85897436 0.34615385 0.6025641 0.16666667]\n", " [0.67948718 0.42307692 0.17948718 0.03846154]\n", " [0.70512821 0.35897436 0.44871795 0.15384615]\n", " [0.87179487 0.38461538 0.64102564 0.28205128]\n", " [0.78205128 0.26923077 0.56410256 0.17948718]\n", " [0.73076923 0.33333333 0.48717949 0.14102564]\n", " [0.82051282 0.3974359 0.64102564 0.24358974]\n", " [0.6025641 0.37179487 0.16666667 0. ]\n", " [0.69230769 0.43589744 0.15384615 0.01282051]\n", " [0.61538462 0.38461538 0.17948718 0. 
]\n", " [0.64102564 0.47435897 0.17948718 0.02564103]\n", " [0.79487179 0.41025641 0.58974359 0.19230769]\n", " [0.82051282 0.37179487 0.73076923 0.26923077]\n", " [0.70512821 0.30769231 0.48717949 0.12820513]\n", " [0.71794872 0.34615385 0.56410256 0.15384615]\n", " [0.80769231 0.34615385 0.70512821 0.26923077]\n", " [0.58974359 0.3974359 0.19230769 0.01282051]\n", " [0.76923077 0.37179487 0.61538462 0.21794872]\n", " [0.62820513 0.42307692 0.19230769 0.03846154]\n", " [0.80769231 0.34615385 0.70512821 0.25641026]\n", " [1. 0.47435897 0.80769231 0.24358974]\n", " [0.84615385 0.37179487 0.65384615 0.28205128]\n", " [0.84615385 0.30769231 0.73076923 0.21794872]\n", " [0.85897436 0.3974359 0.74358974 0.28205128]\n", " [0.6025641 0.37179487 0.16666667 0.02564103]\n", " [0.6025641 0.38461538 0.19230769 0.01282051]\n", " [0.57692308 0.44871795 0.11538462 0.01282051]\n", " [0.71794872 0.55128205 0.17948718 0.03846154]\n", " [0.84615385 0.38461538 0.55128205 0.16666667]\n", " [0.6025641 0.42307692 0.19230769 0.01282051]\n", " [0.55128205 0.3974359 0.15384615 0.01282051]\n", " [0.79487179 0.30769231 0.62820513 0.23076923]\n", " [0.80769231 0.3974359 0.56410256 0.17948718]\n", " [0.65384615 0.43589744 0.17948718 0.01282051]\n", " [0.62820513 0.44871795 0.16666667 0.01282051]\n", " [0.65384615 0.51282051 0.17948718 0. 
]\n", " [0.73076923 0.33333333 0.64102564 0.23076923]\n", " [0.75641026 0.42307692 0.56410256 0.19230769]\n", " [0.84615385 0.38461538 0.58974359 0.17948718]\n", " [0.67948718 0.48717949 0.15384615 0.03846154]\n", " [0.67948718 0.46153846 0.17948718 0.01282051]\n", " [0.69230769 0.29487179 0.46153846 0.11538462]\n", " [0.79487179 0.34615385 0.64102564 0.17948718]\n", " [0.80769231 0.38461538 0.69230769 0.21794872]\n", " [0.83333333 0.37179487 0.55128205 0.16666667]\n", " [0.91025641 0.44871795 0.76923077 0.30769231]\n", " [0.71794872 0.35897436 0.52564103 0.15384615]\n", " [0.96153846 0.37179487 0.83333333 0.25641026]\n", " [0.70512821 0.37179487 0.56410256 0.17948718]\n", " [0.64102564 0.43589744 0.16666667 0.01282051]\n", " [0.97435897 0.34615385 0.84615385 0.24358974]\n", " [0.73076923 0.33333333 0.51282051 0.11538462]\n", " [0.65384615 0.42307692 0.16666667 0.01282051]\n", " [0.62820513 0.43589744 0.15384615 0.02564103]\n", " [0.64102564 0.47435897 0.23076923 0.03846154]\n", " [0.62820513 0.24358974 0.43589744 0.11538462]]\n", "['Iris-virginica' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'\n", " 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica' 'Iris-setosa'\n", " 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor' 'Iris-versicolor'\n", " 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor'\n", " 'Iris-virginica' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'\n", " 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-setosa'\n", " 'Iris-setosa' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'\n", " 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor'\n", " 'Iris-setosa' 'Iris-setosa' 'Iris-virginica' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'\n", " 'Iris-virginica' 'Iris-setosa' 'Iris-virginica' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'\n", " 'Iris-versicolor' 
'Iris-virginica' 'Iris-setosa' 'Iris-virginica'\n", " 'Iris-versicolor' 'Iris-virginica' 'Iris-versicolor' 'Iris-versicolor'\n", " 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor' 'Iris-versicolor'\n", " 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-virginica' 'Iris-setosa' 'Iris-versicolor'\n", " 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor' 'Iris-virginica'\n", " 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica' 'Iris-setosa'\n", " 'Iris-versicolor' 'Iris-virginica']\n", "['Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor'\n", " 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica'\n", " 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-setosa'\n", " 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'\n", " 'Iris-virginica' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-virginica' 'Iris-setosa' 'Iris-virginica'\n", " 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'\n", " 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'\n", " 'Iris-setosa' 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor'\n", " 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-virginica'\n", " 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'\n", " 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor'\n", " 'Iris-virginica' 'Iris-versicolor' 'Iris-virginica' 'Iris-versicolor'\n", " 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa'\n", " 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor']\n" ] } ], "source": [ "print(Dev_norm_data_X)\n", "print(test_norm_data_X)\n", "print(Dev_norm_data_y)\n", "print(test_norm_data_y)" ] }, { "cell_type": "code", "execution_count": 208, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", 
"text": [ "C:\\Users\\mxm5116\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.\n", " warnings.warn(CV_WARNING, FutureWarning)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Best Hyper Parameters:\n", " {'algorithm': 'auto', 'leaf_size': 1, 'n_jobs': -1, 'n_neighbors': 7, 'weights': 'distance'}\n", "Accuracy: 0.9833333333333333\n", "Confusion Metrix:\n", " [[23 0 0]\n", " [ 0 19 1]\n", " [ 0 0 17]]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\mxm5116\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py:813: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.\n", " DeprecationWarning)\n" ] } ], "source": [ "# Now lets Run KNN with a range of K value and normalized euclidean distance\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn import metrics\n", "\n", "params = {'n_neighbors':[1,3,5,7,9,11,13,15,17,19,21],\n", " 'leaf_size':[1,2,3,5,10,15,20,30],\n", " 'weights':['uniform', 'distance'],\n", " 'algorithm':['auto', 'ball_tree','kd_tree','brute'],\n", " 'n_jobs':[-1]}\n", "Dev_model = KNeighborsClassifier(n_jobs=-1)\n", "KNN_model = GridSearchCV(Dev_model, param_grid=params, n_jobs=1)\n", "KNN_model.fit(Dev_norm_data_X,Dev_norm_data_y)\n", "print(\"Best Hyper Parameters:\\n\",KNN_model.best_params_)\n", "prediction=KNN_model.predict(test_norm_data_X)\n", "print(\"Accuracy:\",metrics.accuracy_score(prediction,test_norm_data_y))\n", "print(\"Confusion Metrix:\\n\",metrics.confusion_matrix(prediction,test_norm_data_y))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 
# Now Apply euclidean distance in KNN step by step
def train(Dev_norm_data_X, Dev_norm_data_y):
    """Placeholder training step.

    Nothing is fitted here: `predict` reads the development data directly on
    every call, so this function does no work and returns None. It is kept so
    the train / predict call sequence stays explicit.
    """
    return


def predict(Dev_norm_data_X, Dev_norm_data_y, test_norm_data_X, k):
    """Classify one observation by majority vote of its k nearest neighbours.

    Parameters
    ----------
    Dev_norm_data_X : array-like, one row per development sample
    Dev_norm_data_y : array-like, one label per development sample
    test_norm_data_X : array-like, a single observation (one feature vector)
    k : int, number of neighbours that vote

    Returns
    -------
    The most common label among the k samples with the smallest Euclidean
    distance to the observation. Equal distances are ranked by sample
    position, and a tied vote goes to the label met first in that ranking.

    Raises
    ------
    ValueError
        If k is smaller than 1 or larger than the number of development
        samples.
    """
    dev_X = np.asarray(Dev_norm_data_X, dtype=float)
    # Positional array: a pandas Series with a shuffled index would otherwise
    # be looked up by label instead of by row position.
    dev_y = np.asarray(Dev_norm_data_y)
    query = np.asarray(test_norm_data_X, dtype=float)

    if not 1 <= k <= len(dev_X):
        raise ValueError(
            "k must be between 1 and the number of development samples "
            "({}), got {}".format(len(dev_X), k)
        )

    # Euclidean distance from the observation to every development row at once.
    distances = np.sqrt(np.sum(np.square(dev_X - query), axis=1))

    # A stable sort keeps the lower row index first when distances are equal.
    nearest = np.argsort(distances, kind="stable")[:k]

    # Counter preserves first-seen order, so ties go to the closer neighbour.
    return Counter(dev_y[nearest]).most_common(1)[0][0]


def k_nearest_neighbor(Dev_norm_data_X, Dev_norm_data_y, test_norm_data_X, k):
    """Predict a label for every row of `test_norm_data_X`.

    Parameters
    ----------
    Dev_norm_data_X, Dev_norm_data_y : development samples and their labels
    test_norm_data_X : array-like, one observation per row
    k : int, number of neighbours that vote (see `predict`)

    Returns
    -------
    numpy.ndarray with one predicted label per test row.
    """
    # train on the develop data (a no-op, see `train`)
    train(Dev_norm_data_X, Dev_norm_data_y)

    # Convert once so list or DataFrame inputs work and are not re-parsed
    # for every observation.
    dev_X = np.asarray(Dev_norm_data_X, dtype=float)
    dev_y = np.asarray(Dev_norm_data_y)
    observations = np.asarray(test_norm_data_X, dtype=float)

    predictions = [predict(dev_X, dev_y, row, k) for row in observations]
    return np.asarray(predictions)
# Evaluate the step-by-step Euclidean KNN on the test split for each odd k
# from 1 to 13. One loop replaces seven copy-pasted blocks; the printed lines
# ("accuracy<k> <percent>") are the same as before.
for k in (1, 3, 5, 7, 9, 11, 13):
    # Now make the predictions for this k
    predictions = k_nearest_neighbor(Dev_norm_data_X, Dev_norm_data_y, test_norm_data_X, k)

    # Evaluate accuracy
    accuracy = accuracy_score(test_norm_data_y, predictions)
    print("accuracy{} {}".format(k, 100 * accuracy))
"metadata": {}, "source": [ "# So, after normalizing our result does not change than euclidean" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Now lets apply the cosine similarity" ] }, { "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [], "source": [ "def load_dataset(data, split):\n", " training_set = []\n", " test_set = []\n", " with open(data, 'r') as csvfile:\n", " lines = csv.reader(csvfile)\n", " dataset = list(lines)\n", " for x in range(len(dataset)):\n", " for y in range(4):\n", " dataset[x][y] = float(dataset[x][y])\n", " if random.random() < split:\n", " training_set.append(dataset[x])\n", " else:\n", " test_set.append(dataset[x])\n", "\n", " return training_set, test_set" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [], "source": [ "def cosine_distance(instance1, instance2):\n", " p1 = instance1[:-1]\n", " p2 = instance2[:-1]\n", " return cosine(p1, p2)\n" ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [], "source": [ "def get_neighbors(training_set, test_instance, k):\n", " distances = []\n", " length = len(test_instance) \n", " for x in range(len(training_set)):\n", " dist = cosine_distance(test_instance, training_set[x])\n", " distances.append((training_set[x], dist))\n", " distances.sort(key=operator.itemgetter(1))\n", " \n", " for x in range(k):\n", " neighbors = []\n", " neighbors.append(distances[x][0])\n", " return neighbors" ] }, { "cell_type": "code", "execution_count": 125, "metadata": {}, "outputs": [], "source": [ "def get_response(neighbors):\n", " \n", " class_votes = {}\n", " for x in range(len(neighbors)):\n", " response = neighbors[x][-1]\n", " class_votes[response] = 1\n", " sorted_votes = sorted(class_votes.items(), key=operator.itemgetter(1), reverse=True)\n", " return sorted_votes[0][0]" ] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [], "source": [ "def get_accuracy(test_set, predictions):\n", 
" correct = 0\n", " for x in range(len(test_set)):\n", " if test_set[x][-1] == predictions[x]:\n", " correct += 1\n", " return (correct / float(len(test_set))) * 100.0" ] }, { "cell_type": "code", "execution_count": 227, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train set: 90\n", "Test set: 60\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-setosa', actual='Iris-setosa'\n", "> predicted='Iris-versicolor', actual='Iris-versicolor'\n", "> predicted='Iris-versicolor', actual='Iris-versicolor'\n", "> predicted='Iris-versicolor', actual='Iris-versicolor'\n", "> predicted='Iris-virginica', actual='Iris-versicolor'\n", "> predicted='Iris-versicolor', actual='Iris-versicolor'\n", "> predicted='Iris-virginica', actual='Iris-versicolor'\n", "> predicted='Iris-versicolor', actual='Iris-versicolor'\n", "> predicted='Iris-versicolor', actual='Iris-versicolor'\n", "> predicted='Iris-versicolor', actual='Iris-versicolor'\n", "> predicted='Iris-versicolor', actual='Iris-versicolor'\n", "> predicted='Iris-versicolor', actual='Iris-versicolor'\n", "> predicted='Iris-virginica', actual='Iris-versicolor'\n", "> predicted='Iris-versicolor', actual='Iris-versicolor'\n", "> 
# Evaluate cosine-distance KNN on a random train/test split of the iris data.

SPLIT_SEED = 42   # fixes the random split so the reported accuracy can be reproduced
COSINE_K = 7      # number of neighbours that vote

# prepare data
# load_dataset draws from the module-level random generator, so seed it first.
random.seed(SPLIT_SEED)
split = 0.60
training_set, test_set = load_dataset('iris1.csv', split)
print('Train set: ' + repr(len(training_set)))
print('Test set: ' + repr(len(test_set)))

# predict every test row and show it next to the true label
predictions = []
for test_row in test_set:
    neighbors = get_neighbors(training_set, test_row, COSINE_K)
    result = get_response(neighbors)
    predictions.append(result)
    print('> predicted=' + repr(result) + ', actual=' + repr(test_row[-1]))

accuracy = get_accuracy(test_set, predictions)
print('Accuracy: ' + repr(accuracy) + '%')
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Lets draw the final accuracy bar chart for different distance metric with best k=7\n", "objects=('Euclidean','Normalized Euclidean','Cosine')\n", "y_pos = np.arange(len(objects))\n", "performance = [98.33,98.33,91.66]\n", "plt.bar(y_pos, performance, align='center', alpha=0.5,color='red')\n", "plt.xticks(y_pos, objects)\n", "plt.ylabel('Performance')\n", "plt.title('Performance of different Distance Metric with best Hyperparameter,k=7 in KNN for Iris Data set')\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Final comment: I found that for the iris data, best hyperparameter(k) is 7 and Euclidean distance metric performs better (98.33%) than Cosine distance(91.67%). There is no difference in performance for Euclidean and Normalized Euclidean in KNN." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "References:\n", " 01. https://www.kaggle.com/mayu0116/hyper-parameters-tuning-of-dtree-rf-svm-knn\n", " 02. https://kevinzakka.github.io/2016/07/13/k-nearest-neighbor/\n", " 03. https://github.com/dtroupe18/SimpleKNN/blob/master/knn.py\n", " " ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }